rm(list=ls())

# Load packages
library(haven)
library(tidyverse)
library(data.table)
library(reshape2)

source("code/config.R")
source(paste0(CODE,"functions/parameters.R"))

## Create distance dataset

sf_mpsz <- st_read(dsn = "data/master-plan-2014-subzone-boundary-web-shp", 
                   layer = "MP14_SUBZONE_WEB_PL")
centroids <- st_centroid(sf_mpsz$geometry)
centroids <- st_transform(centroids, "+proj=longlat +ellps=WGS84 +datum=WGS84")
coord<-st_coordinates(centroids)

df<-sf_mpsz$SUBZONE_N
df<-data.frame(coord)
df$SUBZONE_N<-sf_mpsz$SUBZONE_N

cross.join <- function(a, b) {
  idx <- expand.grid(seq(length=nrow(a)), seq(length=nrow(b)))
  cbind(a[idx[,1],], b[idx[,2],])
}

df1<-data.table(df %>% rename(sub_area_d=SUBZONE_N,	x1=X, y1=Y))
df2<-data.table(df %>% rename(sub_area_o=SUBZONE_N, x2=X, y2=Y) )
df<-cross.join(df1,df2)
df$spatial_dist<-distVincentyEllipsoid(df[,c("x1","y1")],  df[,c("x2","y2")])
distances<-df

## Input periods data sets
df <- read_dta(paste(DATA, "panelODAByWeekAdultsOnly.dta", sep=""))
df <- df %>% filter(period=="before") %>% select(-c(period,dt,id))

## collapse to take the mean over all weeks
df<-df %>% group_by(pHome, dp) %>% summarise(leisure= mean(totleis),leisure_w=mean(totleisw), commute=sum(totwork), commute_w=sum(totworkw)) %>% ungroup()
df<-df %>% rename(sub_area_o=pHome, sub_area_d=dp)

# Merge data sets

## rents
commercial_rents <- read_csv(paste(DATA, "REALIS_retail_rent2015.csv", sep=""))
commercial_rents <- commercial_rents %>% dplyr::select(sub_area_d=X1,commercial_rent=Median)
commercial_rents$sub_area_d<- toupper(commercial_rents$sub_area_d)

## retail land use
commercial_quant<- read_csv(paste(DATA, "REALIS_retail_stock.csv", sep=""))
commercial_quant[is.na(commercial_quant)]<-0
commercial_quant <- commercial_quant %>%  dplyr::select(sub_area_d=X1,commercial_quant=`2015Q4`)
commercial_quant$sub_area_d<- toupper(commercial_quant$sub_area_d)

## non-retail land use
office_quant<- read_csv(paste(DATA, "REALIS_nonretail_stock.csv", sep=""))
office_quant[is.na(office_quant)]<-0
office_quant <- office_quant %>%  dplyr::select(sub_area_d=X1,office_quant=`2015Q4`)
office_quant$sub_area_d<- toupper(office_quant$sub_area_d)

## residential rents
residential_rents <- read_csv(paste(DATA, "REALIS_residential_rent2015.csv", sep=""))
residential_rents <- residential_rents <- residential_rents %>% dplyr::select(sub_area_o=X1,residential_rent=Median)
residential_rents$sub_area_o<- toupper(residential_rents$sub_area_o)

## GHS
GHS<- read_csv(paste(DATA, "GHS", sep=""))

## combine datasets
df<-left_join(df,office_quant)
df<-left_join(df,commercial_quant)
df<-left_join(df,commercial_rents)
df<-left_join(df,residential_rents)
df<-left_join(df,distances)
df<-left_join(df,GHS)

## travel time
matTravelTimes <- read_dta(paste(DATA, "commuteTimesArea", sep=""))
matTravelTimes <- matTravelTimes %>% filter(after==0) %>% select(sub_area_o=op,sub_area_d=dp,t)  %>% group_by(sub_area_o,sub_area_d) %>% summarise(t=mean(t))
matTravelTimes_post <- matTravelTimes %>% filter(after==1) %>% select(sub_area_o=op,sub_area_d=dp,t_post=t)  %>% group_by(sub_area_o,sub_area_d) %>% summarise(t_post=mean(t_post))
df<-left_join(df,matTravelTimes)
df<-left_join(df,matTravelTimes_post)

# Subset population by low vs high income
df[is.na(df)]<-0
df <- df %>% mutate(commute=commute-commute_w,leisure=leisure-leisure_w)

# Drop neighborhoods that are not relevant
temp<-df %>% group_by(sub_area_o,residents_census,residents_census_w,avg_wage,avg_wage_w) %>% summarise(residents=sum(commute,na.rm=T))
temp2<-df %>% group_by(sub_area_d,commercial_quant,office_quant) %>% summarise(workers=sum(commute,na.rm=T)) %>% rename(sub_area_o=sub_area_d)
temp<-left_join(temp,temp2)
temp<- temp %>% filter(commercial_quant>0 & office_quant>0)
temp<- temp %>% filter(workers>1000 & residents>1000)
list_pa<- temp$sub_area_o

# Make calculations
df <- df %>% group_by(sub_area_o) %>% mutate(residents=sum(commute),residents_w=sum(commute_w)) %>% group_by(sub_area_d) %>% mutate(workers=sum(commute),workers_w=sum(commute_w)) %>% ungroup()
df <- df %>% filter((sub_area_d %in% list_pa) & (sub_area_o %in% list_pa))
df <- df %>% group_by(sub_area_o) %>% mutate(share_commute = commute/sum(commute), share_commute_w = commute_w/sum(commute_w),share_leisure = leisure/sum(leisure), share_leisure_w = leisure_w/sum(leisure_w)) %>% ungroup()
df$total_pop<-total_pop  
df$total_pop_w<- total_pop_w 
df <- df %>% group_by(sub_area_d) %>% mutate(share_resident = residents/sum(residents), share_resident_w = residents_w/sum(residents_w)) %>% mutate(workers=sum(total_pop*share_resident*share_commute),workers_w=sum(total_pop_w*share_resident*share_commute_w)) %>% ungroup()
df <- df %>% mutate(R=total_pop*share_resident,R_w=total_pop_w*share_resident)

write_csv(df,paste(DATA, "spatial_urban/working_data/clean_data.csv", sep=""))

# Construct data
df$land_share<-df$commercial_quant/(df$office_quant+ df$commercial_quant)
df <- df %>% group_by(sub_area_d,land_share) %>% summarise(commute=sum(commute),commute_w=sum(commute_w)) %>% ungroup()%>% mutate(commute=commute/sum(commute),commute_w=commute_w/sum(commute_w))
df$ratio<-df$commute_w*total_pop_w/(df$commute_w*total_pop_w+df$commute*total_pop)

# Plot: Figure A11
binsreg(df$ratio, df$land_share)
g<-binscatter2(formula="land_share ~ ratio", key_var = "ratio", data=df, bins=14, partial=FALSE)
g + xlab("Low Income Worker Share") + ylab("Non-Tradables Sector Land Share")


### Repeat for post-period ###

## Input periods data sets

df <- read_dta(paste(DATA, "panelODByWeekAdultsOnly.dta", sep=""))
df <- df %>% filter(dt=="2018-06-01") %>% select(-c(period,dt,id))

## collapse to take the mean over all weeks
df<-df %>% group_by(pHome, dp) %>% summarise(leisure= mean(totleis),leisure_w=mean(totleisw), commute=sum(totwork), commute_w=sum(totworkw)) %>% ungroup()
df<-df %>% rename(sub_area_o=pHome, sub_area_d=dp)

## rents
commercial_rents <- read_csv(paste(DATA, "REALIS_retail_rent2018.csv", sep=""))
commercial_rents <- commercial_rents %>% dplyr::select(sub_area_d=X1,commercial_rent=Median)
commercial_rents$sub_area_d<- toupper(commercial_rents$sub_area_d)

## retail land use
commercial_quant<- read_csv(paste(DATA, "/REALIS_retail_stock.csv", sep=""))
commercial_quant[is.na(commercial_quant)]<-0
commercial_quant <- commercial_quant %>%  dplyr::select(sub_area_d=X1,commercial_quant=`2018Q3`)
commercial_quant$sub_area_d<- toupper(commercial_quant$sub_area_d)

## non-retail land use
office_quant<- read_csv(paste(DATA, "/REALIS_nonretail_stock.csv", sep=""))
office_quant[is.na(office_quant)]<-0
office_quant <- office_quant %>%  dplyr::select(sub_area_d=X1,office_quant=`2018Q3`)
office_quant$sub_area_d<- toupper(office_quant$sub_area_d)

## residential rents
residential_rents <- read_csv(paste(DATA, "/REALIS_residential_rent2018.csv", sep=""))
residential_rents <- residential_rents <- residential_rents %>% dplyr::select(sub_area_o=X1,residential_rent=Median)
residential_rents$sub_area_o<- toupper(residential_rents$sub_area_o)

## combine datasets
df<-left_join(df,office_quant)
df<-left_join(df,commercial_quant)
df<-left_join(df,commercial_rents)
df<-left_join(df,residential_rents)
df<-left_join(df,distances)

## travel time
matTravelTimes <- read_dta(paste(DATA, "commuteTimesArea", sep=""))
matTravelTimes <- matTravelTimes %>% filter(after==1) %>% select(sub_area_o=op,sub_area_d=dp,t)  %>% group_by(sub_area_o,sub_area_d) %>% summarise(t=mean(t))
df<-left_join(df,matTravelTimes)

# Subset population by low vs high income
df[is.na(df)]<-0
df <- df %>% mutate(commute=commute-commute_w,leisure=leisure-leisure_w)

# Calculations
df <- df %>% group_by(sub_area_o) %>% mutate(residents=sum(commute),residents_w=sum(commute_w)) %>% group_by(sub_area_d) %>% mutate(workers=sum(commute),workers_w=sum(commute_w)) %>% ungroup()
df <- df %>% group_by(sub_area_o) %>% mutate(share_commute = commute/sum(commute), share_commute_w = commute_w/sum(commute_w),share_leisure = leisure/sum(leisure), share_leisure_w = leisure_w/sum(leisure_w)) %>% ungroup()

df <- df %>% group_by(sub_area_d) %>% mutate(share_resident = residents/sum(residents), share_resident_w = residents_w/sum(residents_w)) %>% mutate(workers=sum(total_pop*share_resident*share_commute),workers_w=sum(total_pop_w*share_resident*share_commute_w)) %>% ungroup()
df <- df %>% mutate(R=total_pop*share_resident,R_w=total_pop_w*share_resident)

write_csv(df,paste(DATA, "clean_data_post.csv", sep=""))




